*** TABULA RASA ***
* clear all

*** FILENAME & FILE-SPECIFIC GLOBALS ***
glo filename 	ieb_wages_prepare
glo date 		= string( d(`c(current_date)'), "%tdCYND" )

*** SETTINGS ***
set more off 
set linesize 200        
set rmsg on            
*set maxvar 32767, permanently
set excelxlsxlargefile on 

/*
*** START LOG-FILE ***
if ${log_active}==1 {
	mac list date
	capture log close
	log using ${log}/${filename}_${date}.log, replace text
	}
*/
	
********************************************************************************
*** ieb_wages_prepare.do **********************************************************
*** Preparation of wage variable(s)*********************************************
*** last change: 2021/1/20 (LH)************************************************
********************************************************************************
*** Author(s): Luke Haywood (lh) and Markus Janser (mj)
*** based on a template of Wolfgang Dauth/Johann Eppelsheimer in cooperation with Johannes Ludsteck 
*** Many thanks to all of them.
*** Template basic version: Version: 1.0, created: 2018-06-01
********************************************************************************
*** Contents:
*** 1. Add contribution assessment ceiling (1975 - 2014)
*** 2. Add Marginal Part-Time Income Threshold and flag affected records (1975 - 2014)
*** 3. Deflate wages, marginal part-time income threshold and contribution assessment ceiling
*** 4. Impute right-censored wages
********************************************************************************

********************************************************************************
*** 1. Add contribution assessment ceiling (1975 - 2014)
********************************************************************************	
/*
Generates the variable:
	- east: 1 if workplace in East Germany (incl. Berlin); 0 if West
*/
		
* Create variable ao_bula (Federal State level, NUTS-1)
cap drop ao_bula
gen ao_bula=int(ao_gem/1000000)
label variable ao_bula "Arbeitsort Bundesland"

cap gen east =.
replace east = 1 if inlist(wo_bula,11,12,13,14,15,16)
replace east = 0 if inlist(wo_bula, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
replace east = 1 if inlist(ao_bula,11,12,13,14,15,16) & east==. & ao_bula!=.
replace east = 0 if inlist(ao_bula, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) & east==. & ao_bula!=.

/*	- limit_assess: contribution assessment ceiling
Notes:
In Germany there is a contribution assessment ceiling ("Beitragsbemessungsgrenze"). Hence, wages are right-cencored.
The generation of the variable limit_assess is based on a FDZ-Arbeitshilfe (http://doku.iab.de/fdz/Bemessungsgrenzen_de_en.xls)
Limits for the years 1975 - 2001 are converted from DM to EUR
*/

	********************************************************************************
	* Set exact assessment ceiling
	********************************************************************************
	gen limit_assess = .
	global exch = 1/1.95583		// exchange rate (EUR/DM)


	* West 1975 - 1991 (incl. Berlin):
	replace limit_assess =  92.05*$exch if jahr == 1975
	replace limit_assess = 101.64*$exch if jahr == 1976
	replace limit_assess = 111.78*$exch if jahr == 1977
	replace limit_assess = 121.64*$exch if jahr == 1978
	replace limit_assess = 131.51*$exch if jahr == 1979
	replace limit_assess = 137.70*$exch if jahr == 1980
	replace limit_assess = 144.66*$exch if jahr == 1981
	replace limit_assess = 154.52*$exch if jahr == 1982
	replace limit_assess = 164.38*$exch if jahr == 1983
	replace limit_assess = 170.49*$exch if jahr == 1984
	replace limit_assess = 177.53*$exch if jahr == 1985
	replace limit_assess = 184.11*$exch if jahr == 1986
	replace limit_assess = 187.40*$exch if jahr == 1987
	replace limit_assess = 196.72*$exch if jahr == 1988
	replace limit_assess = 200.55*$exch if jahr == 1989
	replace limit_assess = 207.12*$exch if jahr == 1990
	replace limit_assess = 213.70*$exch if jahr == 1991

	* West 1992 - 2001 (excl. Berlin):
	replace limit_assess = 222.95*$exch if east == 0 & jahr == 1992
	replace limit_assess = 236.71*$exch if east == 0 & jahr == 1993
	replace limit_assess = 249.86*$exch if east == 0 & jahr == 1994
	replace limit_assess = 256.44*$exch if east == 0 & jahr == 1995
	replace limit_assess = 262.30*$exch if east == 0 & jahr == 1996
	replace limit_assess = 269.59*$exch if east == 0 & jahr == 1997
	replace limit_assess = 276.16*$exch if east == 0 & jahr == 1998
	replace limit_assess = 279.45*$exch if east == 0 & jahr == 1999
	replace limit_assess = 281.97*$exch if east == 0 & jahr == 2000
	replace limit_assess = 286.03*$exch if east == 0 & jahr == 2001

	* West 2002 - 2014 (excl. Berlin):
	replace limit_assess = 147.95 if east == 0 & jahr == 2002
	replace limit_assess = 167.67 if east == 0 & jahr == 2003
	replace limit_assess = 168.85 if east == 0 & jahr == 2004
	replace limit_assess = 170.96 if east == 0 & jahr == 2005
	replace limit_assess = 172.60 if east == 0 & jahr == 2006
	replace limit_assess = 172.60 if east == 0 & jahr == 2007
	replace limit_assess = 173.77 if east == 0 & jahr == 2008
	replace limit_assess = 177.53 if east == 0 & jahr == 2009
	replace limit_assess = 180.82 if east == 0 & jahr == 2010
	replace limit_assess = 180.82 if east == 0 & jahr == 2011
	replace limit_assess = 183.61 if east == 0 & jahr == 2012
	replace limit_assess = 190.68 if east == 0 & jahr == 2013
	replace limit_assess = 195.62 if east == 0 & jahr == 2014
	replace limit_assess = 198.91 if east == 0 & jahr == 2015
	replace limit_assess = 203.84 if east == 0 & jahr == 2016
	replace limit_assess = 208.77 if east == 0 & jahr == 2017
	replace limit_assess = 213.70 if east == 0 & jahr == 2018
	replace limit_assess = 220.28 if east == 0 & jahr == 2019


	/*
		Note: after 1992 Berlin is assigned to East Germany
	*/

	* East 1992 - 2001 (incl. Berlin)
	replace limit_assess = 157.38*$exch if east == 1 & jahr == 1992
	replace limit_assess = 174.25*$exch if east == 1 & jahr == 1993
	replace limit_assess = 193.97*$exch if east == 1 & jahr == 1994
	replace limit_assess = 210.41*$exch if east == 1 & jahr == 1995
	replace limit_assess = 222.95*$exch if east == 1 & jahr == 1996
	replace limit_assess = 233.42*$exch if east == 1 & jahr == 1997
	replace limit_assess = 230.14*$exch if east == 1 & jahr == 1998
	replace limit_assess = 236.71*$exch if east == 1 & jahr == 1999
	replace limit_assess = 232.79*$exch if east == 1 & jahr == 2000
	replace limit_assess = 240.00*$exch if east == 1 & jahr == 2001

	* East 2002 - 2014 (incl. Berlin)
	replace limit_assess = 123.29 if east == 1 & jahr == 2002
	replace limit_assess = 139.73 if east == 1 & jahr == 2003
	replace limit_assess = 142.62 if east == 1 & jahr == 2004
	replace limit_assess = 144.66 if east == 1 & jahr == 2005
	replace limit_assess = 144.66 if east == 1 & jahr == 2006
	replace limit_assess = 149.59 if east == 1 & jahr == 2007
	replace limit_assess = 147.54 if east == 1 & jahr == 2008
	replace limit_assess = 149.59 if east == 1 & jahr == 2009
	replace limit_assess = 152.88 if east == 1 & jahr == 2010
	replace limit_assess = 157.80 if east == 1 & jahr == 2011
	replace limit_assess = 157.38 if east == 1 & jahr == 2012
	replace limit_assess = 161.10 if east == 1 & jahr == 2013
	replace limit_assess = 164.38 if east == 1 & jahr == 2014
	replace limit_assess = 170.96 if east == 1 & jahr == 2015
	replace limit_assess = 177.54 if east == 1 & jahr == 2016
	replace limit_assess = 187.40 if east == 1 & jahr == 2017
	replace limit_assess = 190.69 if east == 1 & jahr == 2018
	replace limit_assess = 202.20 if east == 1 & jahr == 2019

	********************************************************************************
	* Label
	********************************************************************************
	label variable limit_assess "Contribution assessment ceiling"

********************************************************************************
*** 2. Add Marginal Part-Time Income Threshold and flag affected records (1975 - 2014)
********************************************************************************

/*
Generates the variables:
		- limit_marginal: Marginal part-time income threshold
		- marginal: 1 if marginal wage, 0 otherwise
	
Note: Limits for the years 1975 - 2001 are converted from DM to EUR
Based on FDZ Arbeitshilfe (http://doku.iab.de/fdz/Bemessungsgrenzen_de_en.xls)
*/

	********************************************************************************
	* Set limit for marginal wages
	********************************************************************************
	gen limit_marginal = .
	global exch = 1/1.95583		// exchange rate (EUR/DM)

	* West 1975 - 1991 (incl. Berlin):
	replace limit_marginal = 11.51*$exch if jahr == 1975
	replace limit_marginal = 12.70*$exch if jahr == 1976
	replace limit_marginal = 13.97*$exch if jahr == 1977
	replace limit_marginal = 12.82*$exch if jahr == 1978
	replace limit_marginal = 12.82*$exch if jahr == 1979
	replace limit_marginal = 12.79*$exch if jahr == 1980
	replace limit_marginal = 12.82*$exch if jahr == 1981
	replace limit_marginal = 12.82*$exch if jahr == 1982
	replace limit_marginal = 12.82*$exch if jahr == 1983
	replace limit_marginal = 12.79*$exch if jahr == 1984
	replace limit_marginal = 13.15*$exch if jahr == 1985
	replace limit_marginal = 13.48*$exch if jahr == 1986
	replace limit_marginal = 14.14*$exch if jahr == 1987
	replace limit_marginal = 14.43*$exch if jahr == 1988
	replace limit_marginal = 14.79*$exch if jahr == 1989
	replace limit_marginal = 15.45*$exch if jahr == 1990
	replace limit_marginal = 15.78*$exch if jahr == 1991

	* West 1992 - 2001 (excl. Berlin):
	replace limit_marginal = 16.39*$exch if east == 0 & jahr == 1992
	replace limit_marginal = 17.42*$exch if east == 0 & jahr == 1993
	replace limit_marginal = 18.41*$exch if east == 0 & jahr == 1994
	replace limit_marginal = 19.07*$exch if east == 0 & jahr == 1995
	replace limit_marginal = 19.34*$exch if east == 0 & jahr == 1996
	replace limit_marginal = 20.05*$exch if east == 0 & jahr == 1997
	replace limit_marginal = 20.38*$exch if east == 0 & jahr == 1998
	replace limit_marginal = 20.71*$exch if east == 0 & jahr == 1999
	replace limit_marginal = 20.66*$exch if east == 0 & jahr == 2000
	replace limit_marginal = 20.71*$exch if east == 0 & jahr == 2001

	* West 2002 - 2014 (excl. Berlin):
	replace limit_marginal = 10.68 if east == 0 & jahr == 2002
	replace limit_marginal = 13.15 if east == 0 & jahr == 2003
	replace limit_marginal = 13.11 if east == 0 & jahr == 2004
	replace limit_marginal = 13.15 if east == 0 & jahr == 2005
	replace limit_marginal = 13.15 if east == 0 & jahr == 2006
	replace limit_marginal = 13.15 if east == 0 & jahr == 2007
	replace limit_marginal = 13.11 if east == 0 & jahr == 2008
	replace limit_marginal = 13.15 if east == 0 & jahr == 2009
	replace limit_marginal = 13.15 if east == 0 & jahr == 2010
	replace limit_marginal = 13.15 if east == 0 & jahr == 2011
	replace limit_marginal = 13.15 if east == 0 & jahr == 2012
	replace limit_marginal = 14.79 if east == 0 & jahr == 2013
	replace limit_marginal = 14.79 if east == 0 & jahr == 2014
	replace limit_marginal = 14.79 if east == 0 & jahr == 2015
	replace limit_marginal = 14.79 if east == 0 & jahr == 2016
	replace limit_marginal = 14.79 if east == 0 & jahr == 2017
	replace limit_marginal = 14.79 if east == 0 & jahr == 2018
	replace limit_marginal = 14.79 if east == 0 & jahr == 2019
	replace limit_marginal = 14.79 if east == 0 & jahr == 2020

	/*
		Note: after 1992 Berlin is assigned to East Germany because here the assesment ceiling is lower
	*/

	* East 1992 - 2001 (incl. Berlin)
	replace limit_marginal = 09.84*$exch if east == 1 & jahr == 1992
	replace limit_marginal = 12.82*$exch if east == 1 & jahr == 1993
	replace limit_marginal = 14.47*$exch if east == 1 & jahr == 1994
	replace limit_marginal = 15.45*$exch if east == 1 & jahr == 1995
	replace limit_marginal = 16.39*$exch if east == 1 & jahr == 1996
	replace limit_marginal = 17.10*$exch if east == 1 & jahr == 1997
	replace limit_marginal = 17.10*$exch if east == 1 & jahr == 1998
	replace limit_marginal = 20.71*$exch if east == 1 & jahr == 1999
	replace limit_marginal = 20.66*$exch if east == 1 & jahr == 2000
	replace limit_marginal = 20.71*$exch if east == 1 & jahr == 2001

	* East 2002 - 2014 (incl. Berlin)
	replace limit_marginal = 10.68 if east == 1 & jahr == 2002
	replace limit_marginal = 13.15 if east == 1 & jahr == 2003
	replace limit_marginal = 13.11 if east == 1 & jahr == 2004
	replace limit_marginal = 13.15 if east == 1 & jahr == 2005
	replace limit_marginal = 13.15 if east == 1 & jahr == 2006
	replace limit_marginal = 13.15 if east == 1 & jahr == 2007
	replace limit_marginal = 13.11 if east == 1 & jahr == 2008
	replace limit_marginal = 13.15 if east == 1 & jahr == 2009
	replace limit_marginal = 13.15 if east == 1 & jahr == 2010
	replace limit_marginal = 13.15 if east == 1 & jahr == 2011
	replace limit_marginal = 13.15 if east == 1 & jahr == 2012
	replace limit_marginal = 14.79 if east == 1 & jahr == 2013
	replace limit_marginal = 14.79 if east == 1 & jahr == 2014
	replace limit_marginal = 14.79 if east == 1 & jahr == 2015
	replace limit_marginal = 14.79 if east == 1 & jahr == 2016
	replace limit_marginal = 14.79 if east == 1 & jahr == 2017
	replace limit_marginal = 14.79 if east == 1 & jahr == 2018
	replace limit_marginal = 14.79 if east == 1 & jahr == 2019
	replace limit_marginal = 14.79 if east == 1 & jahr == 2020

	********************************************************************************
	* Flag marginal wages
	********************************************************************************
	gen marginal = 0
	replace marginal = 1 if tentgelt <= limit_marginal

	********************************************************************************
	* Labels
	********************************************************************************
	label variable limit_marginal "Marginal part-time income threshold"
	label variable marginal "1 if marginal wage, 0 otherwise"


********************************************************************************
*** 3. Deflate wages, marginal part-time income threshold and contribution assessment ceiling
********************************************************************************

/*
Consumer Price Index:
	Statistisches Bundesamt (2016)
	Preise - Verbraucherpreisindizes fuer Deutschland (Lange Reihe ab 1948)
	https://www.destatis.de/DE/Publikationen/Thematisch/Preise/Verbraucherpreise/VerbraucherpreisindexLangeReihen.html

	
Generates the variables:
	- wage_defl: daily wage, deflated
	- limit_marginal_defl: marginal part-time income threshold, deflated
	- limit_assess_defl: contribution assessment ceiling, deflated

	* (6a) Deflating income information
* Comment:
*	The CPI index is taken from the OECD database. It refers to the national index. *
		
		ALTERNATIVES: 
		a) DESTATIS (https://www-genesis.destatis.de/)
		61111-0010 Verbraucherpreisindex: Bundesländer, Jahre
		Verfügbarer Zeitraum: 1995 - 2018 
		OR
		b) BBSR: CPI at county level
	*/

	********************************************************************************
	* Consumer Price Index West Germany 1975 - 1991 (base year 1995)
	********************************************************************************
	gen cpi=.
	label variable cpi "Consumer Price Index"

	replace cpi = 54.5 if jahr == 1975
	replace cpi = 56.8 if jahr == 1976
	replace cpi = 58.9 if jahr == 1977
	replace cpi = 60.5 if jahr == 1978
	replace cpi = 63.0 if jahr == 1979
	replace cpi = 66.4 if jahr == 1980
	replace cpi = 70.6 if jahr == 1981
	replace cpi = 74.3 if jahr == 1982
	replace cpi = 76.7 if jahr == 1983
	replace cpi = 78.6 if jahr == 1984
	replace cpi = 80.2 if jahr == 1985
	replace cpi = 80.1 if jahr == 1986
	replace cpi = 80.3 if jahr == 1987
	replace cpi = 81.3 if jahr == 1988
	replace cpi = 83.6 if jahr == 1989
	replace cpi = 85.8 if jahr == 1990
	replace cpi = 89.0 if jahr == 1991

	********************************************************************************
	* Set 2010 as base year for the period 1975 - 1991
	********************************************************************************
	replace cpi = (cpi/89.0)*70.2		// see Statistisches Bundesamt (2016)

	********************************************************************************
	* Consumer Price Index Germany (West and East, after 1991)
	********************************************************************************
	replace cpi =  73.8 if jahr == 1992
	replace cpi =  77.1 if jahr == 1993
	replace cpi =  79.1 if jahr == 1994
	replace cpi =  80.5 if jahr == 1995
	replace cpi =  81.6 if jahr == 1996
	replace cpi =  83.2 if jahr == 1997
	replace cpi =  84.0 if jahr == 1998
	replace cpi =  84.5 if jahr == 1999
	replace cpi =  85.7 if jahr == 2000
	replace cpi =  87.4 if jahr == 2001
	replace cpi =  88.6 if jahr == 2002
	replace cpi =  89.6 if jahr == 2003
	replace cpi =  91.0 if jahr == 2004
	replace cpi =  92.5 if jahr == 2005
	replace cpi =  93.9 if jahr == 2006
	replace cpi =  96.1 if jahr == 2007
	replace cpi =  98.6 if jahr == 2008
	replace cpi =  98.9 if jahr == 2009
	replace cpi = 100.0	if jahr == 2010
	replace cpi = 102.1 if jahr == 2011
	replace cpi = 104.1 if jahr == 2012
	replace cpi = 105.7 if jahr == 2013
	replace cpi = 106.6 if jahr == 2014
	replace cpi = 106.9 if jahr == 2015
	replace cpi = 107.4 if jahr == 2016
	replace cpi = 109.3 if jahr == 2017
	
	sum tentgelt, d	
			
	*********************************************************************************************************
	* Deflate wages, marginal part-time income threshold and contribution assessment ceiling (base year 2010)
	*********************************************************************************************************
	gen limit_marginal_defl = 100 * limit_marginal / cpi
	gen limit_assess_defl   = 100 * limit_assess / cpi

	label variable limit_marginal_defl "Marginal part-time income threshold, deflated (2010)"
	label variable limit_assess_defl "Contribution assessment ceiling, deflated (2010)"

	
********************************************************************************
*** 4. Impute right-censored wages
********************************************************************************
********************************************************************************
* Prepare control variables for imputation
********************************************************************************

/*
	Comment by original IAB file authors:
	In principle all variables from the actual analyses should be in the 
	imputation as well. Ommiting controls from the scientific analysis
	in the imputation might result in biased estimates. This is case 
	whether wage is the depenent or an independent variable.	
	It is left to us to complete or modify the list of control variables.	
	Below we run separat regression for the selected groups. Hence, it might also 
	be reasonable to modify these groups, too.
*/

* Education
gen educTmp = bild
label values educTmp lblValEduc
replace educTmp = 0 if missing(bild)	// code missings as 0
tab educTmp, gen(DeducTmp_)
sum educTmp
global minEduc = r(min)
global maxEduc = r(max)

gen age_sq = (agebeg/10)^2 			// express age squared in 100 years (to have readable coefficients in the regressions)
gen old  = agebeg > 40				// dummy for "older" people
gen age_old = agebeg * old 			// different age profiles for young and old workers
gen age_sq_old = age_sq * old		// age_old squared
gen tenadj=genexp_help/1000
label var tenadj "labour market experience divided by 1000"
gen tenadj_sq = tenadj^2	// tenure squared

sum frau old agebeg age_sq age_old age_sq_old tenadj tenadj_sq

* control variables for imputation:

global ind old agebeg age_sq age_old age_sq_old tenadj tenadj_sq Dbild_*	//individual regressors
	
* Count missings in control variables
foreach var of varlist $ind {
	disp "missings in `var':"
	count if missing(`var') & quelle == 1
}	
	
*------------------------------------------------------------------------------*
*	Preparation of Group Variables 
*------------------------------------------------------------------------------*
global group jahr east frau // here we do not use education, because highly educated male workers 

* Count missings in control variables
foreach var of varlist $group {
	disp "missings in `var':"
	count if missing(`var') & quelle == 1
}	
	
* Years & Decades
sum jahr
global minYear = 1975 // was "r(min)", but deliverd 1956 due to data error
global maxYear = r(max)

g decade=.
replace decade=floor((jahr-1900)/10)
global minDec=floor((${minYear}-1900)/10)
global maxDec=floor((${maxYear}-1900)/10)


* Ost
tab east if quelle==1,m
sum east
global minEast = r(min)
global maxEast = r(max)

* Gender
tab frau if quelle==1,m
sum frau
global minGender = r(min)
global maxGender = r(max)

*------------------------------------------------------------------------------*
*	Deflation
*------------------------------------------------------------------------------*

** Deflate wage and assessment limits
gen tentgeltDefl = 100 * tentgelt/cpi
gen limitMarginalDefl = 100 * limit_marginal/cpi
gen limitAssessDefl = 100 * limit_assess/cpi
label variable tentgeltDefl 		"tentgelt, deflated (2010)"
label variable limitMarginalDefl 	"Marginal part-time income threshold, deflated (2010)"
label variable limitAssessDefl 		"Contribution assessment ceiling, deflated (2010)"

* Modify assessment limit 
** substract 4 EUR of exact assessment limit (to make sure all cencored wages are covered by the imputation)
gen limitAssess4 = limitAssessDefl - (100 * 4/cpi)								// ... deflate 4 EUR first
gen lnLimitAssess4 = log(limitAssess4)
label variable limitAssess4			"Contribution assessment ceiling minus 4 EUR"
label variable lnLimitAssess4 		"Log of contribution assessment ceiling minus 4 EUR"

* Flag censored wages
gen cens = 0
replace cens = 1 if tentgeltDefl > limitAssess4 & quelle == 1
label variable cens 				"Censoring Indicator: 1 if right-censored(4 EUR below assessment ceiling)"

*------------------------------------------------------------------------------*
*	Prepare Wage Variable
*------------------------------------------------------------------------------*
* "Wage" is top-coded-replaced pre-imputatoin wage
gen wage = tentgeltDefl 	if quelle == 1
replace wage = limitAssess4 if cens==1
label var wage 						"Tentgelt, not imputed, top-coded replaced, deflated"

* log-wage
gen lnWage = log(wage)				
lab var lnWage						"Log daily wage, not imputed, top-coded replaced"

* log-wages with censored wages reported as missing
gen ln_wage_cens = lnWage if cens == 0
lab var ln_wage_cens				"Log daily wage, not imputed, top-coded replaced as missings"

*------------------------------------------------------------------------------*
*	Overview of censored wages
*------------------------------------------------------------------------------*
* Overall censoring
tab cens if quelle==1, m
tab jahr cens if quelle==1, m

* Censoring by education
tab cens educTmp if quelle==1, m

* Censoring by age
forvalues a = 20(10)60 {
	di "Age < `a' years"
	tab cens if agebeg < `a' & quelle == 1
}
	* censorig of high-skilled workers by age groups
	forvalues a = 25(5)60 {
		disp "subsample high-skilled: age < `a':"
		tab cens if educTmp == 3 &  agebeg < `a' & quelle == 1
	}


* censoring by gender
tab cens frau if quelle == 1, m

tab cens east if decade==7,m
tab cens frau if decade==7,m
tab east frau if decade==7,m

tab cens east if decade==8,m
tab cens frau if decade==8,m
tab east frau if decade==8,m


* Observation count for subgroups for wage imputation - taken out: last loop 		forvalues p = $minPart/$maxPart {... } ...& Dteilzeit==`p'... & Dteilzeit==`p'
forvalues ydec = $minDec/$maxDec {
	forvalues e = $minEast/$maxEast {
		forvalues g = $minGender/$maxGender {	
			di "Subsample: ## decade `ydec' ##  ## east `e' ## ## gender `g' ## "
			count if decade== `ydec' & east==`e' &  frau==`g' 	& quelle==1	
		}
		}
		}
		
save ${data}/w2_test-wages-before-IMPUTATION.dta, replace 
*use ${data}/w2_test-wages-before-IMPUTATION.dta, clear 
		
********************************************************************************
* Step 1: imputation with observable characteristics 
*         (for details refer to Gartner (2005))
********************************************************************************
* taken out: last loop forvalues p = $minPart/$maxPart {...	 }

cap drop ln_wage_imp
gen ln_wage_imp = .

rename lnWage ln_wage

display "Start of wage imputation Loop $S_DATE $S_TIME"
forvalues ydec = $minDec/$maxDec {		
	display "$S_DATE $S_TIME: Imputation for year `ydec'"
	forvalues e = $minEast/$maxEast {
		forvalues g = $minGender/$maxGender {		
			intreg ln_wage ln_wage_cens $ind if decade == `ydec' & east==`e' & frau==`g' 	
																	// Tobit regression with right-censored data (since lnWageC has missings) 
			global sdi = e(sigma)									// save variance of residual from Tobit
			predict xbn if e(sample), xb							// predict xb from observables
			gen eta = (lnLimitAssess4 - xbn) / $sdi if e(sample)	// eta is a residual with standard normal distribution
			gen ln_wage_tmp = ln_wage if cens == 0						// note: do not use if e(sample) here; otherwise uncensored wages of obs. with missing covariables get dropped
			replace ln_wage_tmp = xbn + ///												// take xb from regression and add normally distributed random term
						$sdi * invnorm(normal(eta) + uniform()*(1-normal(eta))) ///		// with s.d.=sigma, that must be larger than social security contribution ceiling
						if e(sample) & cens == 1
			replace ln_wage_imp = ln_wage_tmp if decade == `ydec' & east==`e' & frau==`g' // taken out: & Dteilzeit==`p' 
			count if ln_wage_imp < ln_wage & e(sample)					// make sure imputation produces no values below social security contribution ceiling
			if r(N)>0 di in red r(N) "imputed wage(s) below censoring limit in decade " `ydec' "and east" `e' " and gender " `g' "" // taken out: and parttime " `p'  
			drop xbn eta ln_wage_tmp
			}
		}
	}


display "End of wage imputation Loop $S_DATE $S_TIME"

*------------------------------------------------------------------------------*
*	Imputed wages in levels
*------------------------------------------------------------------------------*
gen wage_imp = exp(ln_wage_imp)				
label variable wage_imp	"Daily wage, imputed"
lab var ln_wage_imp		"Log daily wage, imputed"

g wage_preimp= tentgeltDefl
*------------------------------------------------------------------------------*
*	Overview/Comparison of wages and imputed wages
*------------------------------------------------------------------------------*
* preimp-wage / top-code replace wage / postimp wage
foreach v in wage_preimp wage wage_imp {
g `v'_permonth= `v'*tentgeltdays
}


** Summary statistics - repeat this in 5estimations where
** can differentiate between coal and non-coal wages **
**
** wage_defl is pre-imputation
** wage      is pre-imputation with top-coded replaced
** wage_imp	 is post-imputation

sum wage_preimp wage wage_imp wage_preimp_permonth wage_permonth wage_imp_permonth, det
sum wage_preimp_permonth wage_permonth if cens == 1, det
sum wage_preimp_permonth wage_permonth wage_imp_permonth if cens == 0, det
sum wage_preimp_permonth wage_permonth wage_imp_permonth if decade == $maxDec
sum wage_preimp_permonth wage_permonth wage_imp_permonth if cens == 1 & decade == $maxDec

* Correlations
corr wage*
corr wage* if cens == 1
corr wage* if cens == 0

* Graphical comparison: histogram of wages and imputed wages (last year, 5 years prior, 10 years prior, 15 years prior, ...)
set graphics on
forvalues y = $maxYear (-5) $minYear {
	cap drop temp_x1 temp_h1 temp_x2 temp_h2
	twoway__histogram_gen wage 			if jahr == `y' & !missing(wage_imp), start(0) width(5) gen(temp_h1 temp_x1)
	twoway__histogram_gen wage_imp   	if jahr == `y' , start(0) width(5) gen(temp_h2 temp_x2)

	twoway ///
		(bar temp_h1 temp_x1 if inrange(temp_x1,0,300), barwidth(5) color(khaki)) 				///
		(bar temp_h2 temp_x2 if inrange(temp_x2,0,300), barwidth(5) fcolor(none) lcolor(navy) ) ///
		, legend(rows(1) order(1  "Raw" 2 "Imputed")) title("Distribution of wages in " `y') ///
		  xlabel(0(50)300) graphregion(color(white)) 
	global	graphnameA03 = "${graphs}"+"\"+"nameA03_wage_imputation_`y'"
	graph save $graphnameA03.gph, replace
	graph export $graphnameA03.png, replace
	drop temp_x1 temp_h1 temp_x2 temp_h2
}
set graphics off

*------------------------------------------------------------------------------*
*	Clean up
*------------------------------------------------------------------------------*
** Drop unnecessary controls (only necessary for wage imputation)

drop old age_sq age_old age_sq_old tenadj tenadj_sq ///
		educTmp DeducTmp_* ///
		limit_assess limit_marginal limit_marginal_defl limit_assess_defl limitAssess4 lnLimitAssess4

	 
********************************************************************************
display "End of Wage Imputation: $S_TIME  $S_DATE"

** Observation count
distinct persnr
distinct persnr if begepi<=${cdate} & endepi>=${cdate} & quelle==1

distinct betnr 
distinct betnr if begepi<=${cdate} & endepi>=${cdate} & quelle==1


*** End of subfile "ieb_wages_prepare" ***


